Exercises from Udacity’s Exploratory Data Analysis in R MOOC
#set up chart theme function and load required packages
chart.theme.size = 14
#theme_set(theme_minimal(chart.theme.size)) #minimal theme as default
source("../r_config/r_env_setup.R") #load packages and cutom ggplot theme function and presets
primary_color = color_primary_pal
highlight_color = color_highlight_pal
set_ggplot_theme01 <- do.call(chart_theme_minimal, as.list(chart_format_default))
#load the reddit dataset
reddit_df <- read.csv("../data/reddit.csv", header = T,
stringsAsFactors=TRUE, sep=",",
nrow = 1000)
#Number of rows and columns
dim(reddit_df)
## [1] 1000 14
#List of variables
names(reddit_df)
## [1] "id" "gender" "age.range"
## [4] "marital.status" "employment.status" "military.service"
## [7] "children" "education" "country"
## [10] "state" "income.range" "fav.reddit"
## [13] "dog.cat" "cheese"
#structure of the dataset
str(reddit_df)
## 'data.frame': 1000 obs. of 14 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ gender : int 0 0 1 0 1 0 0 0 0 0 ...
## $ age.range : Factor w/ 7 levels "18-24","25-34",..: 2 2 1 2 2 2 2 1 3 2 ...
## $ marital.status : Factor w/ 5 levels "Engaged","Forever Alone",..: NA NA NA NA NA 4 3 4 4 3 ...
## $ employment.status: Factor w/ 6 levels "Employed full time",..: 1 1 2 2 1 1 1 4 1 2 ...
## $ military.service : Factor w/ 2 levels "No","Yes": NA NA NA NA NA 1 1 1 1 1 ...
## $ children : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ education : Factor w/ 7 levels "Associate degree",..: 2 2 5 2 2 2 5 2 2 5 ...
## $ country : Factor w/ 60 levels "Australia","Barbados",..: 56 56 56 56 56 56 20 56 56 20 ...
## $ state : Factor w/ 46 levels "","Alabama","Alaska",..: 32 32 44 32 6 32 1 6 32 1 ...
## $ income.range : Factor w/ 8 levels "$100,000 - $149,999",..: 2 2 8 2 7 2 NA 7 2 7 ...
## $ fav.reddit : Factor w/ 244 levels "","4chan","adviceanimals",..: 97 94 203 206 21 94 178 76 222 1 ...
## $ dog.cat : Factor w/ 3 levels "I like cats.",..: NA NA NA NA NA 2 2 2 1 1 ...
## $ cheese : Factor w/ 11 levels "American","Brie",..: NA NA NA NA NA 3 3 1 10 7 ...
#levels of age range
levels(reddit_df$age.range)
## [1] "18-24" "25-34" "35-44" "45-54" "55-64"
## [6] "65 or Above" "Under 18"
#count summary of age range
table(reddit_df$age.range)
##
## 18-24 25-34 35-44 45-54 55-64 65 or Above
## 477 384 50 11 5 2
## Under 18
## 69
#barblot for age range. Selective coloring of age ranges where count > 300
ggplot(data = reddit_df, aes(x = age.range)) +
#geom_bar_custom +
geom_bar(aes(fill = ..count.. > 300)) +
scale_fill_brewer(palette="Set1") +
#geom_hline(yintercept=0, size=0.75, color="grey") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
chart_theme_bar +
chart_footer
chart_footnote("Source: Sample Reddit data\nNote*: <add text>")
#p1<-
ggplot(data = reddit_df, aes(x = age.range)) +
geom_bar_custom +
#geom_hline(yintercept=0, size=0.75, color="grey") +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
chart_theme_bar +
chart_footer
chart_footnote("Source: Sample Reddit data\nNote*: <add text>")
#ggsave("random_data.png", g, width=12, height=9)
#Notice that under18 bucket is at the end which is non-intuitive
reddit_df$age.range <- ordered(reddit_df$age.range,
levels=c("Under 18", "18-24", "25-34", "35-44", "45-54",
"55-64", "65 or Above" ))
#or alternatively
reddit_df$age.range <- factor(reddit_df$age.range,
levels=c("Under 18", "18-24", "25-34", "35-44", "45-54",
"55-64", "65 or Above" ),
ordered = TRUE)
#barblot for age range after ordering the factor variable
ggplot(data = reddit_df, aes(x = age.range)) +
do.call(chart_theme_minimal, as.list(chart_format_bar2)) +
geom_bar(colour = primary_color[1], fill = primary_color[1], alpha = 0.5, size = 0.25) +
theme(axis.text.x = element_text(angle=90, hjust=1, vjust=0.5)) +
geom_hline(yintercept=0, size=0.75, color="grey")
#Notice that under18 bucket is placed as the first bar now
#read the dataset into R
fb_df <- read.csv("../data/pseudo_facebook.tsv", header = TRUE,
stringsAsFactors = FALSE, sep="\t")
#take a peek into the fb data
glimpse(fb_df)
## Observations: 99003
## Variables:
## $ userid (int) 2094382, 1192601, 2083884, 1203168, 1733...
## $ age (int) 14, 14, 14, 14, 14, 14, 13, 13, 13, 13, ...
## $ dob_day (int) 19, 2, 16, 25, 4, 1, 14, 4, 1, 2, 22, 1,...
## $ dob_year (int) 1999, 1999, 1999, 1999, 1999, 1999, 2000...
## $ dob_month (int) 11, 11, 11, 12, 12, 12, 1, 1, 1, 2, 2, 2...
## $ gender (chr) "male", "female", "male", "female", "mal...
## $ tenure (int) 266, 6, 13, 93, 82, 15, 12, 0, 81, 171, ...
## $ friend_count (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ friendships_initiated (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ likes_received (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ mobile_likes_received (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ www_likes_received (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
#Create a custom color scale
myColors <- brewer.pal(3,"Set1")
fb_df$gender = factor(fb_df$gender)
#assign factor levels as names to the custom color scale
names(myColors) <- levels(fb_df$gender)
#create an alias for the custom color / fill scale
colScalegender <- scale_colour_manual(name = "gender",values = myColors)
fillScalegender <- scale_fill_manual(name = "gender",values = myColors)
#histogram of b'days
ggplot(data = fb_df, aes(x = dob_day)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
xlab("Day of birth") + ylab("# of members") +
ggtitle("Member Distribution by Day of Birth") +
scale_x_discrete(breaks = seq(1,31, 2)) +
scale_y_continuous(labels = comma) +
chart_footer +
geom_hline(yintercept=0, size=0.75, color="grey") +
geom_vline(xintercept=1, size=0.75, color = primary_color[1]) +
annotate("text", x = 2, y = 6500,
label =
"Abnormally high counts for 1st day of the month\nSeveral member may be selecting the first\ndropdown choice when selecting the day of birth",
size = text.size,color = primary_color[1],
hjust = 0)
chart_footnote()
#faceted by month
#histogram of b'days
ggplot(data = fb_df, aes(x = dob_day)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
xlab("Day of birth") + ylab("# of members") +
ggtitle("Member Distribution by Day of Birth faceted by Month") +
scale_y_continuous(labels = comma) +
scale_x_discrete(breaks = seq(1,31, 8)) +
geom_hline(yintercept=0, size=0.75, color="grey")+
facet_wrap(~dob_month, ncol = 4)
chart_footnote()
#faceted by month and using density (area under each plot is 1) and free scales
#histogram of b'days
ggplot(data = fb_df, aes(x = dob_day)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(aes(y=..density..), colour = "white",
fill = primary_color[1], alpha = 0.4, size = 0.25) +
scale_y_continuous(labels = percent) +
xlab("Day of birth") + ylab("# of members") +
ggtitle("Member Distribution by Day of Birth faceted by Month") +
scale_x_discrete(breaks = seq(1,31, 8)) +
geom_hline(yintercept=0, size=0.75, color="grey")+
facet_wrap(~dob_month, scales = "free", ncol = 4)
#Histogram of friend count (notice that data is concentrated at Zero with few outliers)
ggplot(data = fb_df, aes(x=friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
scale_y_continuous(labels = comma) + scale_x_continuous(labels = comma)
#Friend count after zooming into the previous chart using coord_cartesian
ggplot(data = fb_df, aes(x=friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma) +
coord_cartesian(xlim= c(0, 1000))
#Friend count after limiting the data using limits
ggplot(data = fb_df, aes(x=friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma, limits= c(0, 1000))
#Friend count after limiting the data using limits and binwidth = 25, and breaks at 50
ggplot(data = fb_df, aes(x=friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(binwidth = 25, colour = "white", fill = primary_color[1],
alpha = 0.4, size = 0.25) +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma,
limits= c(0, 1000), breaks = seq(0, 1000, 100))
#Friend count faceted by gender
ggplot(data = fb_df, aes(x=friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(binwidth = 25, colour = "white", fill = primary_color[1],
alpha = 0.4, size = 0.25) +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma,
limits= c(0, 1000), breaks = seq(0, 1000, 300)) +
facet_wrap(~gender)
#Friend count fill by gender
ggplot(data = filter(fb_df, !is.na(gender)), # remove records where gender is NA
aes(x=friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_density(aes(group = gender, fill = gender, colour = gender),
binwidth = 25, #colour = "white", #fill = primary_color[1],
alpha = 0.1, size = 0.25) +
scale_y_continuous() +
scale_x_continuous(labels = comma,
limits= c(0, 1000), breaks = seq(0, 1000, 100))
#Friend count fill by gender using specified color palettes
ggplot(data = na.omit(fb_df), # remove records where any field is NA
aes(x=friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_density(aes(group = gender, fill = gender, colour = gender),
binwidth = 25, #colour = "white", #fill = primary_color[1],
alpha = .2, size = .25) +
#scale_fill_manual(values=highlight_color) + # for manually defined palette
#scale_color_manual(values=highlight_color) +
scale_fill_brewer(palette = "Set1") +
scale_color_brewer(palette = "Set1") +
scale_y_continuous() +
scale_x_continuous(labels = comma,
limits= c(0, 1000), breaks = seq(0, 1000, 100))
#count of levels in gender
fb_df %>%
group_by(gender) %>%
summarise(count = n())
## Source: local data frame [3 x 2]
##
## gender count
## 1 female 40254
## 2 male 58574
## 3 NA 175
#min value of each variable by gender
fb_df %>%
na.omit() %>% #filter out rows with missing values in any variable
group_by(gender) %>%
summarise_each(funs(min))
## Source: local data frame [2 x 15]
##
## gender userid age dob_day dob_year dob_month tenure friend_count
## 1 female 1000008 13 1 1900 1 0 0
## 2 male 1000038 13 1 1900 1 0 0
## Variables not shown: friendships_initiated (int), likes (int),
## likes_received (int), mobile_likes (int), mobile_likes_received (int),
## www_likes (int), www_likes_received (int)
#Histogram for tenure
ggplot(data = fb_df, aes(x = tenure/365)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(binwidth = 1/4, colour = "white", fill = primary_color[1],
alpha = .3, size = .25) +
xlab("Tenure (years)") +
ylab("# of Facebook members") +
ggtitle("Distribution of members by tenure") +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma,
limits= c(0, 7), breaks = seq(0, 7, 1)) +
geom_hline(yintercept=0, size=0.75, color="grey") +
annotate("text", x = Inf, y = -Inf, label = "sumitbajaj.me",
hjust=1.1, vjust= -.5, col="gray", cex=4, alpha = 0.8)
#histogram for user age
ggplot(data = fb_df, aes(x = age)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1],
alpha = 0.3, size = .25) +
geom_hline(yintercept=0, size = 0.75, color = "grey") +
scale_y_continuous(labels = comma) +
ggtitle("Distribution of members by Age") +
xlab("member age") +
ylab("# of Facebook members")
#histogram for user age with adjusted binwidth
#A binwidth of 1 allows us to visualize any unusual spikes in the data
ggplot(data = fb_df, aes(x = age)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1],
alpha = 0.3, size = .25,
binwidth = 1) +
geom_hline(yintercept=0, size = 0.75, color = "grey") +
scale_y_continuous(labels = comma) +
ggtitle("Distribution of members by Age") +
xlab("member age") +
ylab("# of Facebook members") +
scale_x_discrete(labels = comma, breaks = seq(min(fb_df$age), max(fb_df$age), 10))
summary(fb_df$friend_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 31.0 82.0 196.4 206.0 4923.0
#Log transform for the # of likes
summary(log10(fb_df$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -Inf 1 2 -Inf 2 4
#Log transform for the # of likes +1 to avoid infinity at log zero
summary(log10(fb_df$friend_count + 1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
# visualize friend count
p1 <-
ggplot(data = fb_df, aes(x = friend_count)) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1],
alpha = 0.3, size = .25) +
scale_y_continuous(labels = comma) +
scale_x_continuous(labels = comma)
#visualize friend count with log
p2 <-
ggplot(data = fb_df, aes(x = log10(friend_count + 1))) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1],
alpha = 0.3, size = .25) +
scale_y_continuous(labels = comma)
#visualize friend count with square root
p3 <-
ggplot(data = fb_df, aes(x = sqrt(friend_count))) +
do.call(chart_theme_minimal, as.list(chart_format_hist_no_vgrid)) +
geom_histogram(colour = "white", fill = primary_color[1],
alpha = 0.3, size = .25) +
scale_y_continuous(labels = comma)
grid.arrange(p1, p2, p3, ncol = 1)
#alternate way for log scale P2. Note that the x axis label doesn't reflect log
p1 + scale_x_log10()
Note that sum(..count..) will sum across color, so the percentages displayed are percentages of total users. To plot percentages within each group, you can try y = ..density…
ggplot(data = filter(fb_df, !is.na(gender)),
aes(x = friend_count)) +
do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) +
geom_histogram(aes(fill = gender),
binwidth = 10,
color = 'white',
alpha = 0.5, size = 0.25) +
fillScalegender +
scale_x_continuous(labels = comma,
lim = c(0, 1000)) +
scale_y_continuous(labels = comma) +
theme(legend.position = "right")
# frequency polygon - friend count
ggplot(data = filter(fb_df, !is.na(gender)),
aes(x = friend_count,
y = ..count../sum(..count..))) +
do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) +
geom_freqpoly(aes(color = gender, fill = gender),
binwidth = 10,
alpha = 1, size = 1.5) +
colScalegender +
#scale_color_brewer(palette = "Set1") +
scale_y_continuous(labels = percent) +
scale_x_continuous(labels = comma,
lim = c(0, 1000)) +
theme(legend.position = "right")
# frequency polygon - www_likes
p1 <-
ggplot(data = filter(fb_df, !is.na(gender)),
aes(x = www_likes)) +
do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) +
geom_freqpoly(aes(color = gender, fill = gender),
#binwidth = 10,
alpha = 1, size = .5) +
colScalegender +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
theme(legend.position = "right")
print(p1)
p1 + scale_x_log10() +
ylab("Log10Count")
#Who gets more likes males or females?
by(fb_df$www_likes, fb_df$gender, sum)
## fb_df$gender: female
## [1] 3507665
## --------------------------------------------------------
## fb_df$gender: male
## [1] 1430175
fb_df %>%
group_by(gender) %>%
summarise(num_www_likes = sum(www_likes),
n=n(),
likes_per_person = num_www_likes/n())
## Source: local data frame [3 x 4]
##
## gender num_www_likes n likes_per_person
## 1 female 3507665 40254 87.13830
## 2 male 1430175 58574 24.41655
## 3 NA 8590 175 49.08571
#Ans: Females get more likes overall and average female get more than 3x the # of male likes
#table <- xtable(freq,floating=FALSE)
#print(table, type = "HTML")
b1 <-
ggplot(data = filter(fb_df, !is.na(gender)),
aes(x = gender, y = friend_count)) +
geom_boxplot() +
do.call(chart_theme_minimal, as.list((chart_format_hist_no_vgrid["fsize"] = 14))) +
scale_y_continuous(labels = comma) +
xlab("")
print(b1)
#zoom to focus on users having 0-1000 friends
b1 +
coord_cartesian(ylim = c(0, 1000))
#zoom further focus on users having 0-250 friends
b1 +
coord_cartesian(ylim = c(0, 250))
#summary of friend_count by gender
by(fb_df$friend_count, fb_df$gender, summary)
## fb_df$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## fb_df$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
At times variables may have very sparse data e.g. # of mobile logins. In such cases, you may want to create a variable that just captures whether a given feature was ever used.
#Summary of mobile likes
summary(fb_df$mobile_likes)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 4.0 106.1 46.0 25110.0
#You can observe that more than 25% of users have zero mobile likes
summary(fb_df$mobile_likes > 0)
## Mode FALSE TRUE NA's
## logical 35056 63947 0
#More than 35% of users have no mobile likes
#mobile checkin flag variable creation
fb_df <- fb_df %>%
mutate(mobile_checkin = NA) %>%
mutate(mobile_checkin = ifelse(mobile_likes >0, 1, 0)) %>%
mutate(mobile_checkin = factor(mobile_checkin))
summary(fb_df$mobile_checkin)
## 0 1
## 35056 63947
#What % of people ever checked in using mobile
print(sum(fb_df$mobile_checkin == 1) / length(fb_df$mobile_checkin))
## [1] 0.6459097
dim(diamonds)
## [1] 53940 10
glimpse(diamonds)
## Observations: 53940
## Variables:
## $ carat (dbl) 0.23, 0.21, 0.23, 0.29, 0.31, 0.24, 0.24, 0.26, 0.22, ...
## $ cut (fctr) Ideal, Premium, Good, Premium, Good, Very Good, Very ...
## $ color (fctr) E, E, E, I, J, J, I, H, E, H, J, J, F, J, E, E, I, J,...
## $ clarity (fctr) SI2, SI1, VS1, VS2, SI2, VVS2, VVS1, SI1, VS2, VS1, S...
## $ depth (dbl) 61.5, 59.8, 56.9, 62.4, 63.3, 62.8, 62.3, 61.9, 65.1, ...
## $ table (dbl) 55, 61, 65, 58, 58, 57, 57, 55, 61, 61, 55, 56, 61, 54...
## $ price (int) 326, 326, 327, 334, 335, 336, 336, 337, 337, 338, 339,...
## $ x (dbl) 3.95, 3.89, 4.05, 4.20, 4.34, 3.94, 3.95, 4.07, 3.87, ...
## $ y (dbl) 3.98, 3.84, 4.07, 4.23, 4.35, 3.96, 3.98, 4.11, 3.78, ...
## $ z (dbl) 2.43, 2.31, 2.31, 2.63, 2.75, 2.48, 2.47, 2.53, 2.49, ...
ggplot(data = diamonds, aes(x = price)) +
geom_histogram(colour = "white", fill = primary_color[1], alpha = 0.4, size = 0.25) +
geom_vline(xintercept=mean(diamonds$price), size=0.75, color = "grey") +
annotate("text", x = mean(diamonds$price)*1.03, y = 15000,
label = "mean", hjust = 0) +
geom_vline(xintercept=median(diamonds$price), size=0.75, color = "grey") +
annotate("text", x = median(diamonds$price)*1.03, y = 12000,
label = "median", hjust = 0)
summary(diamonds$price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 326 950 2401 3933 5324 18820
#count by price points
diamonds %>%
mutate(price_pnt = ifelse(price < 250, "<250",
ifelse(price < 500, "<500",
ifelse(price >= 15000, ">15k", "other")))) %>%
group_by(price_pnt) %>%
summarise(n = n())
## Source: local data frame [3 x 2]
##
## price_pnt n
## 1 <500 1729
## 2 >15k 1656
## 3 other 50555
#scatterplot - relation between age and friend count
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
set_ggplot_theme01 +
geom_point(alpha = 1,
color = primary_color[1])
#set alpha to 1/20 i.e 20 data points make one solid dot
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
set_ggplot_theme01 +
geom_point(alpha = 1/20,
color = primary_color[1])
#limit the age from 13 to 90 yrs
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
set_ggplot_theme01 +
geom_point(alpha = 1/20,
color = primary_color[1]) +
xlim(13, 90)
#add jitter to avoid straight vertical lines
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
set_ggplot_theme01 +
geom_jitter(alpha = 1/20,
color = primary_color[1]) +
scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10))
#add square root Y axis to reduce right skew
#limit the age from 13 to 90 yrs
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
set_ggplot_theme01 +
geom_jitter(alpha = 1/20,
position = position_jitter(h = 0),
color = primary_color[1]) +
scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10)) +
coord_trans(y = "sqrt")
The square root, x to x^(1/2) = sqrt(x), is a transformation with a moderate effect on distribution shape: it is weaker than the logarithm and the cube root. It is also used for reducing right skewness, and also has the advantage that it can be applied to zero values. Note that the square root of an area has the units of a length. It is commonly applied to counted data, especially if the values are mostly rather small.
If we add noise to zero we could end up with -ve numbers where sqrt will be imaginary. Set the position parameter equal to position_jitter and pass it a min height of zero.
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
set_ggplot_theme01 +
geom_jitter(alpha = 1/20,
position = position_jitter(h = 0),
color = "light grey") +
scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10)) +
coord_trans(y = "sqrt") +
geom_line(aes(color = "Mean"), stat = "summary", fun.y = mean) +
geom_line(aes(color = "10% Quantile"), stat = "summary", fun.y = quantile,
probs = 0.1, linetype = 2) +
geom_line(aes(color = "90% Quantile"), stat = "summary", fun.y = quantile,
probs = 0.9, linetype = 2) +
#scale_color_manual(values=primary_color[1]) +
#annotate("text", x=max(fb_df$age)+1, y = mean(fb_df$friend_count), label = "mean", hjust = 0) +
theme(legend.position = "right") +
guides(colour = guide_legend(override.aes = list(size=1)))
#zoom in using coord-cartesian
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
set_ggplot_theme01 +
coord_cartesian(xlim = c(13, 70), ylim = c(0, 1000)) +
geom_jitter(alpha = 1/20,
position = position_jitter(h = 0),
color = "light grey") +
#scale_x_continuous(limits = c(13, 90), breaks = seq(13, 90, 10)) +
#coord_trans(y = "sqrt") +
geom_line(aes(color = "Mean"), stat = "summary", fun.y = mean) +
geom_line(aes(color = "10% Quantile"), stat = "summary", fun.y = quantile,
probs = 0.1, linetype = 2) +
geom_line(aes(color = "90% Quantile"), stat = "summary", fun.y = quantile,
probs = 0.9, linetype = 2) +
#scale_color_manual(values=primary_color[1]) +
#annotate("text", x=max(fb_df$age)+1, y = mean(fb_df$friend_count), label = "mean", hjust = 0) +
theme(legend.position = "right") +
guides(colour = guide_legend(override.aes = list(size=1)))
###Correlation A correlation of >0.3 is mild, >0.5 is moderate and >0.7 is pretty strong
cor.test(fb_df$age, fb_df$friend_count, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: fb_df$age and fb_df$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
#alternatively for less verbose code
with(fb_df, cor.test(age, friend_count, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
with(filter(fb_df, age<=70), cor.test(age, friend_count, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.5923, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1780220 -0.1654129
## sample estimates:
## cor
## -0.1717245
#spearman coeff for monotonic relationships
with(filter(fb_df, age<=70), cor.test(age, friend_count, method = "spearman"))
##
## Spearman's rank correlation rho
##
## data: age and friend_count
## S = 1.5782e+14, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2552934
engagement variables all measure essentiall the same thing - engagement. e.g. # of photo uploads, no. of posts etc..
ggplot(data = fb_df, aes(x = www_likes_received, y = likes_received)) +
set_ggplot_theme01 +
geom_point(position = position_jitter(h=0),
color = "light grey",
alpha = 1/20) +
xlim(0, quantile(fb_df$www_likes_received, 0.95)) +
ylim(0, quantile(fb_df$likes_received, 0.95)) +
geom_smooth(method = "lm", color = primary_color[1], size = 2) +
annotate("text", x = Inf, y = -Inf, label = "sumitbajaj.me",
hjust=1.1, vjust=-.5, col="gray", cex=4, alpha = 0.8)
#correlation coeff
cor.test(fb_df$www_likes_received, fb_df$likes_received, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: fb_df$www_likes_received and fb_df$likes_received
## t = 937.1035, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
#load the mitchel dataset for soil temp study
#install.packages("alr3")
library(alr3)
glimpse(Mitchell)
## Observations: 204
## Variables:
## $ Month (int) 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ Temp (dbl) -5.18333, -1.65000, 2.49444, 10.40000, 14.99440, 21.7167...
#Range of the month variable
range(Mitchell$Month)
## [1] 0 203
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
chart_theme01 +
geom_point() +
chart_footer+
scale_x_continuous(breaks = seq(0, 203, 12)) #increments of 12 months
with(Mitchell, cor.test(Month, Temp, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: Month and Temp
## t = 0.8182, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
#non parametric test
with(Mitchell, dcor.ttest(Month, Temp))
##
## dcor t-test of independence
##
## data: Month and Temp
## T = -0.939, df = 20501, p-value = 0.8261
## sample estimates:
## Bias corrected dcor
## -0.006558215
#notice the cyclical patterns with line chart and aspect ratio using coord_fixed (y/x)
ggplot(data = Mitchell, aes(x = Month, y = Temp)) +
chart_theme01 +
geom_point() +
geom_line(color = color_primary_pal[1], alpha = 0.3, size = 2) +
coord_fixed(ratio = 1) +
chart_footer +
scale_x_continuous(breaks = seq(0, 203, 12)) #increments of 12 months
#to compare monthly trends
ggplot(data = Mitchell, aes(x = Month%%12, y = Temp)) +
chart_theme01 +
geom_point(color = color_primary_pal[1], alpha = 0.5, size = 4) +
scale_x_continuous(breaks = seq(0, 13, 1)) +
chart_footer
###Noise and smootheming
pf.fc_by_age <- fb_df %>%
filter(age < 71) %>%
group_by(age) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n())
#plot mean friend count against age (yrs)
p1 <-
ggplot(data = pf.fc_by_age, aes(x = age, y = friend_count_mean)) +
chart_theme01 +
scale_y_continuous(limit = c(0,400)) +
scale_x_continuous(limit = c(0, 72)) +
geom_line(color = color_primary_pal[1], alpha = 1, size = 1) +
geom_smooth(size = 0.1, fill = grey(0.9)) +
chart_footer
pf.fc_by_age[15:20,]
## Source: local data frame [6 x 4]
##
## age friend_count_mean friend_count_median n
## 1 27 134.1473 72.0 2240
## 2 28 125.8354 66.0 2364
## 3 29 120.8182 66.0 1936
## 4 30 115.2080 67.5 1716
## 5 31 118.4599 63.0 1694
## 6 32 114.2800 63.0 1443
#mean friend count by age (year.month) to generate an even noisier plot
pf.fc_by_age_months <- fb_df %>%
filter(age < 71) %>%
mutate(age_with_months = age + (12 - dob_month)/12) %>%
group_by(age_with_months) %>%
summarise(friend_count_mean = mean(friend_count),
friend_count_median = median(friend_count),
n = n())
#plot mean friend count against age.month
p2 <-
ggplot(data = pf.fc_by_age_months, aes(x = age_with_months, y = friend_count_mean)) +
chart_theme01 +
scale_y_continuous(limit = c(0,400)) +
scale_x_continuous(limit = c(0, 72)) +
geom_line(color = color_primary_pal[2], alpha = 1, size = 0.25) +
geom_smooth(size = 0.1, fill = grey(0.9)) +
chart_footer
#Loess regression to smooth the mean friend count
p3 <-
fb_df %>%
filter(age < 71) %>%
ggplot(aes(x = round(age/5)*5, y = friend_count)) +
chart_theme01 +
geom_line(stat = "summary", fun.y = mean,
color = color_primary_pal[3], alpha = 1, size = 1) +
scale_x_continuous(limit = c(0, 72)) +
coord_cartesian(ylim = c(0, 400)) +
geom_smooth(size = 0.1, fill = grey(0.9)) +
chart_footer
grid.arrange(p2, p1, p3, ncol = 1)
ggplot(data = subset(fb_df, !is.na(gender)), aes(x = age, y = friend_count)) +
chart_theme01 +
geom_line(stat = "summary", fun.y = mean,
aes(color = gender)) +
#colScalegender +
scale_colour_brewer(palette="Set1") +
legend_top +
legend_title_hide +
legend_size_override() +
chart_footer +
ggtitle("Mean friend count by age and gender")
#mean friend count by age>genger
pf.fc_by_age_gender <-
fb_df %>%
filter(!is.na(gender)) %>%
group_by(age, gender) %>%
summarise(mean_friend_count = mean(friend_count),
median_friend_count = median(as.numeric(friend_count)),
n = n())
head(pf.fc_by_age_gender, 4)
## Source: local data frame [4 x 5]
## Groups: age
##
## age gender mean_friend_count median_friend_count n
## 1 13 female 259.1606 148.0 193
## 2 13 male 102.1340 55.0 291
## 3 14 female 362.4286 224.0 847
## 4 14 male 164.1456 92.5 1078
#median friend count by age for each gender
ggplot(data = pf.fc_by_age_gender, aes(x = age, y = median_friend_count)) +
chart_theme01 +
geom_line(aes(color = gender)) +
#colScalegender +
scale_colour_brewer(palette="Set1") +
legend_top +
legend_title_hide +
legend_size_override() +
chart_footer +
ggtitle("Median friend count by age and gender")
#spread to wide format key-value
pf.fc_by_age_gender_wide <- pf.fc_by_age_gender %>%
select(1, 2, 4) %>%
spread(gender, median_friend_count) %>%
mutate(ratio = male/female)
head(pf.fc_by_age_gender_wide, 3)
## Source: local data frame [3 x 4]
##
## age female male ratio
## 1 13 148 55.0 0.3716216
## 2 14 224 92.5 0.4129464
## 3 15 276 106.5 0.3858696
#plot ratio of median friends female / male
ggplot(data = pf.fc_by_age_gender_wide, aes(x = age, y = female/male)) +
chart_theme01 +
geom_line(color = color_primary_pal[2], alpha = 1, size = 0.75) +
coord_cartesian(ylim = c(0, 3)) +
geom_hline(yintercept =1, linetype = 2, size = 1, color = color_highlight_pal[1]) +
chart_footer
#note: one possible reason. More growth from newer countries. Initial users more likely to be male.
#create variable for year joined fb
fb_df$year_joined = 2014 - ceiling(fb_df$tenure/365)
summary(fb_df$year_joined)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2005 2012 2012 2012 2013 2014 2
#looks like most people joined in last 2 years
table(fb_df$year_joined)
##
## 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
## 9 15 581 1507 4557 5448 9860 33366 43588 70
#create bucketed variable for year joined
fb_df <- fb_df %>%
mutate(year_joined_bucket = cut(year_joined, c(2004, 2009, 2011, 2012, 2014)))
table(fb_df$year_joined_bucket)
##
## (2004,2009] (2009,2011] (2011,2012] (2012,2014]
## 6669 15308 33366 43658
#plot each year_join bucket for median friend count against age
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
chart_theme01 +
chart_footer +
geom_line(aes(color = year_joined_bucket),
stat = "summary", fun.y = median) +
scale_colour_brewer(palette="Set1") +
legend_top +
legend_size_override()
#Plot the grand mean as y intercept
#plot each year_join bucket for median friend count against age
ggplot(data = fb_df, aes(x = age, y = friend_count)) +
chart_theme01 +
chart_footer +
geom_line(aes(color = year_joined_bucket),
stat = "summary", fun.y = mean) +
scale_colour_brewer(palette="Set1") +
geom_line(stat = "summary", fun.y = mean, linetype = 2, aes(color = "Grand Mean")) +
annotate("text", x = 15, y = mean(fb_df$friend_count)*1.2, label = "Grand Mean") +
legend_top +
legend_size_override()
#Friendships initiated by tenure
ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1),
aes(x = tenure, y = friendships_initiated/tenure)) +
geom_line(aes(color = year_joined_bucket),
stat = "summary", fun.y = mean) +
chart_theme01 +
chart_footer +
legend_top + legend_size_override()
#It appears that users with higher tenure initiate less friendships
#Smoothening the tenure - bias variance tradeoff. as we increase the binwidth, the variance goes down and the bias goes up
ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1),
aes(x = round(tenure/7)*7, y = friendships_initiated/tenure)) +
geom_line(aes(color = year_joined_bucket),
stat = "summary", fun.y = mean) +
chart_theme01
ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1),
aes(x = round(tenure/30)*30, y = friendships_initiated/tenure)) +
geom_line(aes(color = year_joined_bucket),
stat = "summary", fun.y = mean) +
chart_theme01
ggplot(data = filter(fb_df, !is.na(tenure), !is.na(friendships_initiated), tenure >1, friendships_initiated >1),
aes(x = round(tenure/90)*90, y = friendships_initiated/tenure)) +
geom_line(aes(color = year_joined_bucket),
stat = "summary", fun.y = mean, , size = 1) +
chart_theme01
#using geom_smooth
ggplot(data = filter(fb_df, tenure >=1),
aes(x = tenure, y = friendships_initiated/tenure)) +
geom_smooth(aes(color = year_joined_bucket)) +
chart_theme01 +
scale_x_continuous(labels = comma)
yo <- read.csv("../data/yogurt.csv", header = T, stringsAsFactors = TRUE, sep = ",")
head(yo)
## obs id time strawberry blueberry pina.colada plain mixed.berry
## 1 1 2100081 9678 0 0 0 0 1
## 2 2 2100081 9697 0 0 0 0 1
## 3 3 2100081 9825 0 0 0 0 1
## 4 4 2100081 9999 0 0 0 0 1
## 5 5 2100081 10015 1 0 1 0 1
## 6 6 2100081 10029 1 0 2 0 1
## price
## 1 58.96
## 2 58.96
## 3 65.04
## 4 65.04
## 5 48.96
## 6 65.04
str(yo)
## 'data.frame': 2380 obs. of 9 variables:
## $ obs : int 1 2 3 4 5 6 7 8 9 10 ...
## $ id : int 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 2100081 ...
## $ time : int 9678 9697 9825 9999 10015 10029 10036 10042 10083 10091 ...
## $ strawberry : int 0 0 0 0 1 1 0 0 0 0 ...
## $ blueberry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pina.colada: int 0 0 0 0 1 2 0 0 0 0 ...
## $ plain : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mixed.berry: int 1 1 1 1 1 1 1 1 1 1 ...
## $ price : num 59 59 65 65 49 ...
#change id to factor variable from int
yo$id <- as.factor(yo$id)
#Price histogram
ggplot(data = yo, aes(x = price)) +
chart_theme01 +
geom_histogram(binwidth = 1, fill = color_primary_pal[1])
#the price histogram indicates discreteness
length(unique(yo$price))
## [1] 20
#only 20 unique prices
#new variable for total # of purchases in a transaction
yo <- yo %>%
mutate(all.purchases = strawberry + blueberry + pina.colada + plain + mixed.berry)
summary(yo$all.purchases)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 1.971 2.000 21.000
#histogram of all.purchases
ggplot(data = yo, aes(x = all.purchases))+
chart_theme01 +
geom_histogram(fill = color_primary_pal[1], binwidth = 1)
#scatterplot of price vs time
ggplot(data = yo, aes(x = time, y = price))+
chart_theme01 +
geom_point(color = color_primary_pal[1], alpha = 0.5)
#the most common prices seem to be increasing over time
#the scattered lower prices could be due to markdowns or usage of coupons
#set the seed for reproducible samples
set.seed(4230)
#draw 16 households
sample.ids <- sample(levels(yo$id), 16)
#scatterplot of price vs time
ggplot(data = filter(yo, id %in% sample.ids), aes(x = time, y = price))+
chart_theme01 +
geom_point(color = color_primary_pal[1], alpha = 0.5, aes(size = all.purchases)) +
geom_line(color = color_highlight_pal[1], size = .5, alpha = 0.8) +
legend_top +
facet_wrap(~id) +
ggtitle("# purchases and price trend over time for 16 random households")
#set the seed for reproducible samples
set.seed(1836)
#extract columns 2:5 from fb dataframe
pf_subset <- fb_df[, c(2:5)]
ggpairs(pf_subset[sample.int(nrow(pf_subset), 100), ],
params=list(corSize=4, fontsize = 4))